Python code group assignment TIL6022¶

Group 19¶

In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from time import strftime
from datetime import datetime
from sklearn.linear_model import LinearRegression
from scipy import stats
%matplotlib inline

Fuel prices¶

In [2]:
# Import the data of prices of fossil fuels
fuel_file_path = r"C:\Users\skuppers\Downloads\Data project\80416ENG_UntypedDataSet_12102022_091823.csv"
df1 = pd.read_csv(fuel_file_path, delimiter= ';')
df1.head()
Out[2]:
ID Periods Euro95_1 Diesel_2 LPG_3
0 0 20060101 1.325 1.003 0.543
1 1 20060102 1.328 1.007 0.542
2 2 20060103 1.332 1.007 0.540
3 3 20060104 1.348 1.020 0.550
4 4 20060105 1.347 1.021 0.550
In [3]:
# Processing all data of fossil fuels into data of only the year 2022

df1 = df1[df1['Periods'] > 20220000]
df1.rename(columns = {'Euro95_1' : 'Euro95', 'Diesel_2' : 'Diesel', 'LPG_3': 'LPG'}, inplace = True)

# Add another formats of dates to the dataframe

df1['Dates'] = pd.date_range(start='01/01/2022', periods=len(df1), freq='D')

df1['Date'] = df1['Dates'].dt.strftime('%d/%m/%Y')

    
df1.head()
Out[3]:
ID Periods Euro95 Diesel LPG Dates Date
5844 5844 20220101 1.974 1.613 0.912 2022-01-01 01/01/2022
5845 5845 20220102 1.975 1.615 0.910 2022-01-02 02/01/2022
5846 5846 20220103 1.976 1.620 0.915 2022-01-03 03/01/2022
5847 5847 20220104 1.977 1.620 0.911 2022-01-04 04/01/2022
5848 5848 20220105 1.977 1.622 0.904 2022-01-05 05/01/2022
In [4]:
# Prices of fossil fuels and some events during the year of 2022

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Plot fuel prices over the course of 2022

fig1 = px.line(df1, x = 'Date', y = ['Euro95', 'Diesel', 'LPG'] ,
               hover_data={'Periods':False, 'Dates' : False, 'Date' : False})

# Add the months as values to the x-axis

fig1.update_layout(title =  'Fuel prices', xaxis_title="Date", yaxis_title = 'Fuel price in euros',
                  xaxis = dict( tickmode = 'array', ticktext = months, 
                               tickvals = ['01/01/2022', '01/02/2022', '01/03/2022', '01/04/2022',
                                           '01/05/2022','01/06/2022','01/07/2022','01/08/2022',
                                           '01/09/2022','01/10/2022']))

# Add the significant events influencing the fuel prices to the plot
fig1.add_annotation(x= '24/02/2022', y=0.7, ax='24/02/2022', ay=2.6,text='Start war',
                    xref='x', yref='y', axref='x', ayref='y')

fig1.add_annotation(x='01/04/2022', y=0.7, ax='01/04/2022', ay=2.6, text='Fuel tax reduction',
                    xref='x', yref='y', axref='x', ayref='y')

fig1.add_annotation(x= '11/07/2022', y=0.7, ax='11/07/2022', ay=2.6, text='Maintenance Nord Stream',
                    xref='x', yref='y', axref='x', ayref='y')

fig1.add_annotation(x= '27/09/2022', y=0.7, ax='27/09/2022', ay=2.6, text='Explosion nord stream',
                    xref='x', yref='y', axref='x', ayref='y')

fig1.update_layout(hovermode="x unified")

# Months are indicated on every first day
fig1.show()

# Store figure 1 for main document
%store fig1
Stored 'fig1' (Figure)
In [ ]:
 

Google mobility report¶

In [5]:
# Import the data form the Google Mobility Report
df2 = pd.read_csv(r"C:\Users\skuppers\Downloads\Data project\2022_NL_Region_Mobility_Report.csv")
df2 = df2[df2['sub_region_1'].isnull()]

df2.rename(columns = {'retail_and_recreation_percent_change_from_baseline':'retail_and_recreation',
                     'grocery_and_pharmacy_percent_change_from_baseline':'grocery_and_pharmacy',
                     'parks_percent_change_from_baseline': 'parks',
                     'transit_stations_percent_change_from_baseline': 'transit_stations',
                     'workplaces_percent_change_from_baseline': 'workplaces', 
                     'residential_percent_change_from_baseline': 'residential'}, inplace = True)


df2.head()
Out[5]:
country_region_code country_region sub_region_1 sub_region_2 metro_area iso_3166_2_code census_fips_code place_id date retail_and_recreation grocery_and_pharmacy parks transit_stations workplaces residential
0 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-01 -78.0 -72.0 11.0 -53.0 -49.0 9.0
1 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-02 -51.0 5.0 9.0 -41.0 -12.0 6.0
2 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-03 -35.0 7.0 22.0 -51.0 -48.0 15.0
3 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-04 -38.0 8.0 13.0 -52.0 -46.0 15.0
4 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-05 -44.0 0.0 -6.0 -52.0 -44.0 14.0
In [6]:
# Figure from Google mobility
fig2 = px.line(df2, x = 'date', y = ['retail_and_recreation', 'grocery_and_pharmacy', 'parks', 
                                     'transit_stations', 'workplaces', 'residential'])

fig2.show()

# Store figure 2 for main document
%store fig2
Stored 'fig2' (Figure)
In [7]:
## Create new columns with a 7 day average. 
## The 7 day average must give a better view of the trend. 


df2['7_day_average_ret_and_rec'] = df2['retail_and_recreation'].rolling(window=7).mean()

df2['7_day_average_gro_and_phar'] = df2['grocery_and_pharmacy'].rolling(window=7).mean()

df2['7_day_average_parks'] = df2['parks'].rolling(window=7).mean()

df2['7_day_average_tran_stat'] = df2['transit_stations'].rolling(window=7).mean()

df2['7_day_average_work'] = df2['workplaces'].rolling(window=7).mean()

df2['7_day_average_resi'] = df2['residential'].rolling(window=7).mean()


df2.head(10)
Out[7]:
country_region_code country_region sub_region_1 sub_region_2 metro_area iso_3166_2_code census_fips_code place_id date retail_and_recreation ... parks transit_stations workplaces residential 7_day_average_ret_and_rec 7_day_average_gro_and_phar 7_day_average_parks 7_day_average_tran_stat 7_day_average_work 7_day_average_resi
0 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-01 -78.0 ... 11.0 -53.0 -49.0 9.0 NaN NaN NaN NaN NaN NaN
1 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-02 -51.0 ... 9.0 -41.0 -12.0 6.0 NaN NaN NaN NaN NaN NaN
2 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-03 -35.0 ... 22.0 -51.0 -48.0 15.0 NaN NaN NaN NaN NaN NaN
3 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-04 -38.0 ... 13.0 -52.0 -46.0 15.0 NaN NaN NaN NaN NaN NaN
4 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-05 -44.0 ... -6.0 -52.0 -44.0 14.0 NaN NaN NaN NaN NaN NaN
5 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-06 -41.0 ... 26.0 -51.0 -44.0 15.0 NaN NaN NaN NaN NaN NaN
6 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-07 -47.0 ... -5.0 -51.0 -42.0 15.0 -47.714286 -6.571429 10.000000 -50.142857 -40.714286 12.714286
7 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-08 -56.0 ... -28.0 -48.0 -17.0 10.0 -44.571429 3.142857 4.428571 -49.428571 -36.142857 12.857143
8 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-09 -51.0 ... 14.0 -40.0 -9.0 6.0 -44.571429 2.714286 5.142857 -49.285714 -35.714286 12.857143
9 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-10 -39.0 ... 8.0 -46.0 -29.0 11.0 -45.142857 2.000000 3.142857 -48.571429 -33.000000 12.285714

10 rows × 21 columns

In [8]:
# Setting up the figure
plot_rows=6
plot_cols=1
fig3 = make_subplots(rows=plot_rows, cols=plot_cols, 
                     subplot_titles=('retail and recreation', 'grocery and pharmacy',
                                     'parks', 'transit stations', 
                                     'workplaces', 'residential'))

# Add traces
col_names = ['7_day_average_ret_and_rec', '7_day_average_gro_and_phar', '7_day_average_parks',
             '7_day_average_tran_stat', '7_day_average_work', '7_day_average_resi']

for i in range(1, plot_rows+1):
    for j in range(1, plot_cols+1):
        fig3.add_trace(go.Scatter(x=df2['date'], y=df2[col_names[i-1]].values,
                                 name=col_names[i-1],
                                 mode='lines'), 
                      row=i, 
                      col=j)

        
# Format and show fig
fig3.update_layout(height=2000, width=900, title_text="Google mobility plots by motive")
fig3.show()


# Store figure 3 for main document
%store fig3
Stored 'fig3' (Figure)
In [9]:
# Add dates to the dataframe

df2['Dates'] = pd.date_range(start='01/01/2022', periods=len(df2), freq='D')

df2.head()
Out[9]:
country_region_code country_region sub_region_1 sub_region_2 metro_area iso_3166_2_code census_fips_code place_id date retail_and_recreation ... transit_stations workplaces residential 7_day_average_ret_and_rec 7_day_average_gro_and_phar 7_day_average_parks 7_day_average_tran_stat 7_day_average_work 7_day_average_resi Dates
0 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-01 -78.0 ... -53.0 -49.0 9.0 NaN NaN NaN NaN NaN NaN 2022-01-01
1 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-02 -51.0 ... -41.0 -12.0 6.0 NaN NaN NaN NaN NaN NaN 2022-01-02
2 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-03 -35.0 ... -51.0 -48.0 15.0 NaN NaN NaN NaN NaN NaN 2022-01-03
3 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-04 -38.0 ... -52.0 -46.0 15.0 NaN NaN NaN NaN NaN NaN 2022-01-04
4 NL Netherlands NaN NaN NaN NaN NaN ChIJu-SH28MJxkcRnwq9_851obM 2022-01-05 -44.0 ... -52.0 -44.0 14.0 NaN NaN NaN NaN NaN NaN 2022-01-05

5 rows × 22 columns

In [10]:
# Adding the monthly averages into new dataframes

for i in range(1,10):
    file_name = 'df_month%i' %i
    locals()[file_name] = df2[df2['Dates'].dt.strftime('%Y-%m') == '2022-0%i' %i]

dfmaand10 = df2[df2['Dates'].dt.strftime('%Y-%m') == '2022-10']


    
for i in range(1,10):
    file_name2 = 'avg_month%i' %i
    locals()[file_name2] = {}
    
    locals()[file_name2]['retail_and_reccreation'] = (locals()['df_month%i' %i].iloc[:,9].sum() 
                                                      / len(locals()['df_month%i' %i]))
    locals()[file_name2]['grocery_and_pharmacy'] = (locals()['df_month%i' %i].iloc[:,10].sum() 
                                                    / len(locals()['df_month%i' %i]))
    locals()[file_name2]['parks'] = (locals()['df_month%i' %i].iloc[:,11].sum() 
                                     / len(locals()['df_month%i' %i]))
    locals()[file_name2]['transit_stations'] = (locals()['df_month%i' %i].iloc[:,12].sum() 
                                                / len(locals()['df_month%i' %i]))
    locals()[file_name2]['workplaces'] = (locals()['df_month%i' %i].iloc[:,13].sum() 
                                          / len(locals()['df_month%i' %i]))
    locals()[file_name2]['residential'] = (locals()['df_month%i' %i].iloc[:,14].sum() 
                                           / len(locals()['df_month%i' %i]))
    
In [11]:
# Adding the montly averages to their categories

categories = list(avg_month1.keys())

for j in range(len(categories)):
    file_name4 = categories[j]
    locals()[file_name4] = []
    for i in range(1,10):
        file_name3 = 'avg_month%i' %i
        locals()[file_name4] += [locals()[file_name3][categories[j]]]
In [12]:
# Renaming and checking the new dataframe

data1 = {'retail_and_recreation' : retail_and_reccreation, 'grocery_and_pharmacy':grocery_and_pharmacy, 
        'parks' : parks, 'transit_stations': transit_stations, 
        'workplaces':workplaces, 'residential': residential}

df3 = pd.DataFrame(data1)

df3.head()
Out[12]:
retail_and_recreation grocery_and_pharmacy parks transit_stations workplaces residential
0 -34.838710 0.806452 6.774194 -42.032258 -24.741935 9.548387
1 -15.142857 2.071429 12.642857 -32.964286 -19.392857 6.285714
2 -7.387097 2.870968 33.193548 -26.548387 -16.580645 3.935484
3 -0.833333 7.366667 71.933333 -18.666667 -17.333333 2.766667
4 2.677419 10.612903 90.193548 -14.741935 -15.290323 1.064516
In [13]:
# Plotting the bar graph with the monthly averages

fig4 = px.bar(df3, x = df3.index, y = ['retail_and_recreation', 'grocery_and_pharmacy', 
                                       'parks', 'transit_stations', 
                                       'workplaces', 'residential'])

newnames = {'retail_and_recreation' : 'Retail and reccreation', 'grocery_and_pharmacy': 'Grocery and pharmacy', 
            'parks' : 'Parks', 'transit_stations': 'Transit stations', 
            'workplaces': 'Workplaces', 'residential': 'Residential'}

fig4.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                      legendgroup=newnames[t.name],
                                      hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug', 'Sep', 'Oct']
fig4.update_layout(title='Mobility development per category', 
                   xaxis_title="Date", yaxis_title='Percentual change mobiity' , 
                   xaxis=dict( tickmode = 'array', ticktext=months, tickvals=df3.index))

fig4.show()

# Store fig4 for main document
%store fig4
Stored 'fig4' (Figure)
In [14]:
# Setting up the graph for the regression of the movements to transit stations

counter = []
for i in range(len(df2['transit_stations'])):
    counter += [i]
df2['counter'] = counter
    
df2.head()


x1 = np.array(counter)
x2 = x1.reshape(-1, 1)
y = np.array(df2['transit_stations'])
z = np.array(df2['Dates'])

reg = LinearRegression().fit(x2, y)


m, b = np.polyfit(x1, y, 1)


plt.xticks(ticks = [0,30,58,89,119,150,180,211,241,272], labels = months)
plt.plot(x1, y, 'o')
plt.plot(x1, m * x1 + b)
plt.xlabel("Date")
plt.ylabel("Percentage change compared to baseline")
plt.title("Movements change involving transit stations compared to baseline")
plt.tight_layout()


# Store fig5 for main document

plt.savefig('fig5.png')

Mobility report and fuel prices¶

In [15]:
# Caclculating correlation between movements and fuel prices

df5 = df2.drop(len(df2)-1)

correl = pd.DataFrame()
correl['Euro95'] = df1['Euro95']
correl['Diesel'] = df1['Diesel']

# Setting up correlation matrix

corfix = []
for i in range(5843,6119):
    corfix += [i]
df5['corfix'] = corfix
df6 = df5.set_index('corfix')



correl['retail_and_recreation'] = df6['retail_and_recreation']
correl['grocery_and_pharmacy'] = df6['grocery_and_pharmacy']
correl['parks'] = df6['parks']
correl['transit-stations'] = df6 ['transit_stations']
correl['workplaces'] = df6['workplaces']
correl['residential'] = df6['residential']



cormat = correl.corr()
round(cormat,2)
Out[15]:
Euro95 Diesel retail_and_recreation grocery_and_pharmacy parks transit-stations workplaces residential
Euro95 1.00 0.72 0.37 0.06 0.20 0.24 0.14 -0.35
Diesel 0.72 1.00 0.65 0.32 0.44 0.54 0.18 -0.58
retail_and_recreation 0.37 0.65 1.00 0.61 0.69 0.58 -0.01 -0.52
grocery_and_pharmacy 0.06 0.32 0.61 1.00 0.42 0.38 0.10 -0.35
parks 0.20 0.44 0.69 0.42 1.00 0.45 -0.19 -0.35
transit-stations 0.24 0.54 0.58 0.38 0.45 1.00 0.67 -0.92
workplaces 0.14 0.18 -0.01 0.10 -0.19 0.67 1.00 -0.75
residential -0.35 -0.58 -0.52 -0.35 -0.35 -0.92 -0.75 1.00
In [16]:
# Plotting correlation heatmap

fig6, ax = plt.subplots() 
sns.heatmap(cormat, ax=ax)     
plt.tight_layout()
fig6.savefig("fig6.png")

# Store fig6 for main document

%store fig6
Stored 'fig6' (Figure)

Traffic jams¶

In [22]:
# Read all excel files with data on dutch traffic jams and assign them a name file + number of month

months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September']

for i in range(1, 10):
    file_name = 'file_%i' % i
    locals()[file_name] = pd.read_excel(
        r"C:\Users\skuppers\Downloads\Data project\File " + months[i-1] + ".xlsx")

    
# Display head of january as example  
file_1.head(5)
Out[22]:
NLSitNummer DatumFileBegin DatumFileEind TijdFileBegin TijdFileEind FileZwaarte GemLengte FileDuur HectometerKop HectometerStaart ... TrajVan TrajNaar OorzaakGronddetail OorzaakVerloop OorzaakCodeVerloop OorzaakCode Oorzaak_1 Oorzaak_2 Oorzaak_3 Oorzaak_4
0 2922750 2022-01-05 2022-01-05 12:42:00 12:48:27 16.770 2600.000 6.450 8.5 11.1 ... Aken Geleen File buiten spits (geen oorzaak gemeld) [Geen oorzaakcode opgegeven door VWM 6] [000] 3 File buiten spits (geen oorzaak gemeld) Geen oorzaak gemeld Drukte Hoge intensiteit
1 2922758 2022-01-05 2022-01-05 13:11:00 14:35:27 386.158 4572.627 84.450 30.3 34.1 ... Gouda Hoek van Holland Opruimingswerkzaamheden [Geen oorzaakcode opgegeven door VWM 9], [Poli... [000], [M30], [OCL], [000] OCL Opruimingswerkzaamheden Opruimwerkzaamheden Incident (opruimwerkzaamheden) Incident
2 2922764 2022-01-05 2022-01-05 13:23:00 14:07:53 138.740 3091.125 44.883 146.5 148.6 ... Duitse grens Arnhem Defecte vrachtwagen(s) [Geen oorzaakcode opgegeven door VWM 7], [Defe... [000], [HBD], [M13] HBD Defecte vrachtwagen(s) Defecte vrachtwagen Incident (gestrand voertuig) Incident
3 2922771 2022-01-05 2022-01-05 14:00:00 14:33:28 123.883 3701.693 33.467 160.6 164.5 ... Maastricht Eindhoven File buiten spits (geen oorzaak gemeld) [Geen oorzaakcode opgegeven door VWM 33] [000] 3 File buiten spits (geen oorzaak gemeld) Geen oorzaak gemeld Drukte Hoge intensiteit
4 2922776 2022-01-05 2022-01-05 14:13:00 14:40:27 79.585 2899.271 27.450 142.0 145.3 ... Duitse grens Arnhem File buiten spits (geen oorzaak gemeld) [Geen oorzaakcode opgegeven door VWM 27] [000] 3 File buiten spits (geen oorzaak gemeld) Geen oorzaak gemeld Drukte Hoge intensiteit

5 rows × 26 columns

In [23]:
# For all files sum up GemLengte to have total amount of traffic jams due to high road intensities per month
sum_traffic = []
for i in range(1, 10):
    file_name = 'file_%i' % i
    locals()[file_name] = locals()[file_name][locals()[file_name]['Oorzaak_3'] == 'Drukte']
    sum_traffic.append(locals()[file_name]['GemLengte'].sum()/1000)
    
# Make a list with total amount of kilometres of traffic jams per month rounded. 
total_traffic = []
for i in sum_traffic:
    total_traffic.append(round(i))

print(total_traffic)
[4216, 23715, 38005, 19526, 10522, 36030, 17346, 20371, 84457]
In [24]:
# Make new dataframe for plotly express visuals
data2 = {'months': months,
       'total jam length [km]': total_traffic}

df7 = pd.DataFrame(data2)


# Make plotly express bar plot for traffic jams
fig7 = px.bar(df7, x = 'months', y = 'total jam length [km]', 
              color = 'months', title = 'Total amount of traffic in jams', 
              text = 'total jam length [km]')
fig7.show()


# Store figure for main document
%store fig7
Stored 'fig7' (Figure)
In [ ]: